/*** CREATES DEMOGRAPHIC TRACKING MEASURES BY CAMPUS-GRADE-YEAR ***/

clear all
set more 1

do "paths.do"
cd "$WORKING"

** set number of permutations to run
global NSIMRAND 1000
global NSIMPURP 1000
timer clear

/*
R-squared: Actual R2 from regression of prior scores on class indicators
Sorting index:
 Collins&Gan(2013) "does sorting students improve scores? An analysis of class composition"
 j-class, k-campus
 a_1k: s.d. of test scores within a school, defined as (1/N)*sum(deviations from overall mean)
 a_2k: square root of (enrollment weighted average of (variance of test scores in each class j))
 sort_k=a_1k/a_2k
 Note: we adjusted a_2k to be (1/N)*sum(deviations from class mean), so captures extent of
  tracking for typical student rather than typical classroom
In addition to actual measures, we create two simluated distributions for standardizing:
 i) Repeatedly randomly assigning students to classes maintaining actual numbers and sizes of classes
 ii) Repeatedly reordering classes and allocating students to these strictly by test score rank
 
Run time: around 3 days
*/


/************************************************
LOOP OVER YEAR AND GRADE
************************************************/

timer on 1

foreach vv in poc frpl {
	** loop over subject
	** foreach ss in math ela sci soc {
	foreach ss in math {
		** loop over year
		foreach year in 2011 2012 2013 2014 2015 2016 2017 2018 2019 {
			
			display "`vv' `ss' `year'"
			
			capture log close
			log using "$LOGFILES\tracking_8\tracking_8_`year'_`vv'_`ss'.log", replace

			** loop over grade
			forval g = 4(1)8{
			
				timer on 2
				
				timer on 3
				
				display "`vv' `ss' `year' `g'"
				
				foreach trk_lev in dis_sch sch_cls {
					
				/**************************************************************
				loop over tracking level
				school-within-district vs class-within-school
				**************************************************************/
				
				use "$WORKING\tracking_7 pieces\tracking_7_Z_`year'_`g'_`ss'.dta", clear
				
				keep distnum0 campus grade year Sclass_id Dblack Dhisp Pdisadv id1
				if ("`trk_lev'" == "dis_sch") {
					drop if distnum0 == .
					drop Sclass_id
					rename campus Sclass_id
					gen campus = distnum0
				}
				
				
				if ("`vv'" == "poc") {
					gen D_i = Dblack | Dhisp
					assert D_i != .
				}
				else {
					gen D_i = (Pdisadv > 0)
					assert D_i != .
				}
				
				disp "/*** CREATE VARIABLES WITH NUMBERS AND SIZES OF CLASSES BY CAMPUS ***/"
				
				** drop if class size=1
				bysort campus Sclass_id: gen tmp1=_N
				gen tmponestd=(tmp1==1)
				summ tmponestd
				drop if tmponestd==1

				** drop if only one class w/2+ students
				bysort campus: gen tmp2=_N
				gen tmponecls=(tmp1==tmp2)
				summ tmponecls
				drop if tmponecls==1
				drop tmp*

				** create identifiers and enrollment counts for schools and classes
				bysort campus Sclass_id: gen tmp1=_n
				count if tmp1==1
				sort campus Sclass_id
				egen gsch=group(campus)
				egen gcls=group(gsch Sclass_id)
				egen clstag=tag(gcls)
				egen numcls=total(clstag), by(gsch)
				bys gsch: gen schsz=_N
				bys gcls: gen clssz=_N
				summ g* numcls schsz clssz clstag
				* A = number of campuses
				summ gsch
				scalar A=r(max)
				* B = maximum number of classes for any school in grade
				summ numcls
				scalar B=r(max)
				drop Sclass_id tmp*

				** number classes w/in schools (from 1 to the # of classes)
				* here we number classes starting with the largest
				gsort gsch -clstag -clssz gcls
				bys gsch clstag: gen tmp1=_n if clstag==1
				egen clsnum=max(tmp1), by(gcls)
				egen tmp2=max(clsnum), by(gsch)
				assert tmp2==numcls
				drop tmp*
				 
				** create class size distribution variables
				* clsszX is size of class #X, and cumclssszX cumulates students across classes 1 to X
				forvalues i=1(1)`=B'{
				 local j=`i'-1
				 gen tmp1=clssz if clsnum==`i'
				 egen clssz`i'=max(tmp1), by(gsch)
				 if `i'==1{
				  gen cumclssz1=clssz1
				 }
				 if `i'>1{
				  gen cumclssz`i'=clssz`i'+cumclssz`j'
				 }
				 drop tmp*
				}
				summ clssz* cumclssz*

				disp "/*** CALCULATE ACTUAL SORTING AND R-SQUARED INDICES ***/"
				
				** overall std dev of scores
				egen tmp1=sd(D_i), by(gsch)
				* adjust std dev by dividing by N rather than N-1
				* in stata, variance is (1/(N-1))*(sum of deviations from the mean), and std dev is sqrt,
				* so need to adjust
				gen NUM=tmp1*sqrt((schsz-1)/schsz)

				** std dev of student scores from classroom averages
				egen tmp2=mean(D_i), by(gcls)
				qui replace tmp2=(D_i-tmp2)^2
				egen tmp3=sum(tmp2), by(gsch)
				qui replace tmp3=sqrt(tmp3/schsz)

				** calculate the sorting index
				drop tmp*

				** calculate the R-squared
				egen y_bar = mean(D_i), by(gsch)
				egen y_hat = mean(D_i), by(gcls)
				gen yh_yb2 = (y_hat - y_bar)^2
				gen y_yb2 = (D_i - y_bar)^2
				egen r2_num = sum(yh_yb2), by(gsch)
				egen r2_denom = sum(y_yb2), by(gsch)
				gen trackR = r2_num / r2_denom
				drop y_bar y_hat yh_yb2 y_yb2 r2_num r2_denom
				
				** recalculate the tracking measures using the minimum loop factors
				
				gen tmp1 = D_i ^ 2
				egen iota_s = sum(tmp1), by(gsch)
				drop tmp1

				egen tmp1 = sum(D_i), by(gsch)
				gen lambda_s = (tmp1 ^ 2) / schsz
				drop tmp1

				egen tmp1 = sum(D_i), by(gcls)
				gen tmp2 = (tmp1 / clssz) ^ 2
				egen kappa_s = sum(tmp2), by(gsch)
				drop tmp1 tmp2

				gen trackR_alt = (kappa_s - lambda_s) / (iota_s - lambda_s)
				// gen trackS_alt = sqrt(1 / (1 - trackR_alt))
				
				egen Ex = mean(D_i), by(gsch)
				egen Ex2 = mean(D_i ^ 2), by(gsch)
				egen Ex3 = mean(D_i ^ 3), by(gsch)
				egen Ex4 = mean(D_i ^ 4), by(gsch)
				
				** schsz >= 4, so all fourth moments are defined
				gen Exixixixj = (Ex * Ex3 * (schsz / (schsz - 1))) - (Ex4 / (schsz - 1))
				gen Exixixjxj = (Ex2 * Ex2 * (schsz / (schsz - 1))) - (Ex4 / (schsz - 1))
				gen Exixixjxk = ( ((schsz ^ 2) * (Ex ^ 2) * Ex2) + (2 * Ex4) - ///
						((schsz * (Ex2 ^ 2)) + (2 * schsz * Ex * Ex3)) ) / ///
						((schsz - 1) * (schsz - 2))
				gen Exixjxkxl = ( ((schsz ^ 3) * (Ex ^ 4)) + (8 * schsz * Ex * Ex3) + (3 * schsz * (Ex2 ^ 2)) - ///
						((6 * (schsz ^ 2) * (Ex ^ 2) * Ex2) + (6 * Ex4)) ) / ///
						((schsz - 1) * (schsz - 2) * (schsz - 3))
				
				** clssz >= 2, so psi_43 and psi_44 may equal zero
				egen tmp1 = sum(clssz ^ 2) if clstag, by(gsch)
				egen tmp2 = sum((clssz - 1) ^ 2) if clstag, by(gsch)
				gen psi_0 = (numcls * (numcls - 1))
				gen psi_1 = ((schsz - numcls) ^ 2) - tmp2
				gen psi_2 = (schsz ^ 2) + tmp2 - (((schsz - numcls) ^ 2) + tmp1 + psi_0)
				egen psi_3 = sum(1 / (clssz ^ 1)) if clstag, by(gsch)
				egen psi_43 = sum(((clssz - 1) * (clssz - 2) * (clssz - 3)) / (clssz ^ 1)) if clstag, by(gsch)
				egen psi_42 = sum(((clssz - 1) * (clssz - 2)) / (clssz ^ 1)) if clstag, by(gsch)
				egen psi_41 = sum(((clssz - 1)) / (clssz ^ 1)) if clstag, by(gsch)
				
				gen EK2 = ((psi_0 + (3 * psi_41)) * Exixixjxj) + ///
						((psi_1 + psi_43) * Exixjxkxl) + ///
						((psi_2 + (6 * psi_42)) * Exixixjxk) + ///
						(psi_3 * Ex4) + (4 * psi_41 * Exixixixj)

				gen trackRavg = (numcls - 1) / (schsz - 1)
				gen EK = lambda_s + (trackRavg * (iota_s - lambda_s))
				gen Erho2 = (EK2 + (lambda_s ^ 2) - (2 * lambda_s * EK)) / ((iota_s - lambda_s) ^ 2)
				gen Vrho = Erho2 - (trackRavg ^ 2)
				gen tmp3 = sqrt(Vrho)
				egen trackRsd = min(tmp3), by(gsch)
				drop tmp*
				
				drop psi_* EK2 EK Erho2 Vrho Exix*
				
				/*
				Technically this is the same as iota_s, but want a variable that is explicitly
				just the number of people with the attribute in the campus-grade-year.
				*/
				egen D_s = sum(D_i), by(gsch)
				
				save "$TEMPFILES\tracking_8_A.dta", replace
				
				timer off 3
				
				timer on 4
				
				disp "/*** SIMULATE INDICES UNDER RANDOM ASSIGNMENT ***/"
				
				set seed 1312018
				
				keep if clstag == 1
				
				save "$TEMPFILES\tracking_8_B.dta", replace
				
				// Aggregate up to school
				egen schtag = tag(gsch)
				keep if schtag == 1
				
				keep distnum0 campus grade year track*
// 				drop trackR_p
				
				save "$TEMPFILES\tracking_8_C.dta", replace
				
				timer off 4
				
				timer on 5
				
				disp "/*** SIMULATE INDICES UNDER PURPOSEFUL ASSIGNMENT ***/"
					
				use "$TEMPFILES\tracking_8_A.dta", clear
				
				/*
				won't have individual students or even classes later, so make
				student-weighted average class size now
				*/
				egen avgclssz=mean(clssz), by(campus grade year)
				
				// keep one obs per class
				keep if clstag == 1
				
				save "$TEMPFILES\tracking_8_D.dta", replace
				
				// Make a class-by-simulation-number level dataset
				gen nsim = $NSIMPURP
				expand nsim
				
				bysort gsch gcls: gen sim = _n
				sort gsch sim gcls
				gen tmp1 = uniform()
				egen clsrnk = rank(tmp1), by(gsch sim) unique

				sort gsch sim clsrnk
				by gsch sim: gen simcumclssz = sum(clssz)
				gen D_c = max(0, min(clssz, D_s - (simcumclssz - clssz)))
				
				// Calculate tracking measures from minimum loop factors
				gen tmp2 = (D_c ^ 2) / clssz
				egen kappa_sp = sum(tmp2), by(gsch sim)
				
				gen trackR_p = (kappa_sp - lambda_s) / (iota_s - lambda_s)
				
				// Aggregate up to school-by-simulation-number
				egen schsimtag = tag(gsch sim)
				keep if schsimtag == 1
				
				egen trackRavg_max = mean(trackR_p), by(gsch)
				egen trackRsd_max = sd(trackR_p), by(gsch)
				
				// Aggregate up to school
				egen schtag = tag(gsch)
				keep if schtag == 1
				
				keep distnum0 campus grade year schsz D_s track*
				drop trackR_p
				
				timer off 5
				
				timer on 6
				
				// merge in random assignment results and rename variables
				
				merge 1:1 distnum0 campus grade year using "$TEMPFILES\tracking_8_C.dta"
				assert _merge == 3
				drop _merge
				order distnum0 campus grade year schsz D_s track*
				
				save "$TEMPFILES\tracking_8_E.dta", replace
				
				la var trackR "actual R2 tracking measure (campus x grade x year)"
				la var trackR_alt "actual R2 tracking measure - alt calc"
				la var trackRavg "average R2 across simulations, randsom assn"
				la var trackRsd "std dev R2 across simulations, random assn"
				la var trackRavg_max "average R2 across simulations, full tracking"
				la var trackRsd_max "std dev R2 across simulations, full tracking"

				disp "/*** SAVE DATASET ***/"
				
				if ("`trk_lev'" == "dis_sch") {
					drop campus
					foreach aa in R R_alt Ravg Rsd Ravg_max Rsd_max {
						rename track`aa' dtrack`aa'
					}
				}
				
				save "$WORKING\tracking_8 results\tracking_8_`year'_`g'_`vv'_`ss'_`trk_lev'", replace
				
				timer off 6
				
				timer off 2
				
				timer list
				
				timer clear 2
				
				/**************************************************************
				loop over tracking level
				school-within-district vs class-within-school
				**************************************************************/
				
				}
			}
		}

		timer off 1

		timer list
		
	}
}

/************************************************
COMBINE SEPARATE YEAR-GRADE DATASETS AND ADD OTHER CAMPUS-GRADE-YEAR VARIABLES
dis_sch
************************************************/

foreach vv in poc frpl {
	

	** loop over years and grades to combine datasets
	local k = 0
	foreach ss in math {
		foreach year in 2011 2012 2013 2014 2015 2016 2017 2018 2019 {
			forval g=4(1)8{
				display "`year'"
				di `g' + 100
				local k = `k' + 1
				if (`k' == 1) {
					use "$WORKING\tracking_8 results\tracking_8_`year'_`g'_`vv'_`ss'_dis_sch", clear
					gen subj = "`ss'"
				}
				else {
					append using "$WORKING\tracking_8 results\tracking_8_`year'_`g'_`vv'_`ss'_dis_sch"
					replace subj = "`ss'" if subj == ""
				}
			}
		}
	}
	
	foreach rs in R {
		rename D_s `vv'
		rename dtrack`rs' dtrack`rs'_`vv'
		foreach as in avg sd avg_max sd_max {
			rename dtrack`rs'`as' dtrack`rs'`as'_`vv'
		}
	}
	
	save "$WORKING\temp\tracking_8_`vv'", replace
}

use "$WORKING\temp\tracking_8_poc", clear
merge 1:1 distnum0 grade year using "$WORKING\temp\tracking_8_frpl"
assert _merge == 3
drop _merge

save "$WORKING\tracking_8_dis_sch", replace


/************************************************
COMBINE SEPARATE YEAR-GRADE DATASETS AND ADD OTHER CAMPUS-GRADE-YEAR VARIABLES
sch_cls
************************************************/

foreach vv in poc frpl {
	
	** loop over years and grades to combine datasets
	local k = 0
	foreach ss in math {
		foreach year in 2011 2012 2013 2014 2015 2016 2017 2018 2019 {
			forval g=4(1)8{
				display "`year'"
				di `g' + 100
				local k = `k' + 1
				if (`k' == 1) {
					use "$WORKING\tracking_8 results\tracking_8_`year'_`g'_`vv'_`ss'_sch_cls", clear
					gen subj = "`ss'"
				}
				else {
					append using "$WORKING\tracking_8 results\tracking_8_`year'_`g'_`vv'_`ss'_sch_cls"
					replace subj = "`ss'" if subj == ""
				}
			}
		}
	}
	
	foreach rs in R {
		rename D_s `vv'
		rename track`rs' track`rs'_`vv'
		foreach as in avg sd avg_max sd_max {
			rename track`rs'`as' track`rs'`as'_`vv'
		}
	}
	
	save "$WORKING\temp\tracking_8_`vv'", replace
}


use "$WORKING\temp\tracking_8_poc", clear

merge 1:1 campus grade year using "$WORKING\temp\tracking_8_frpl"
assert _merge == 3
drop _merge

save "$WORKING\tracking_8.dta", replace


log close
